//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <cstdio>
#include <sstream>
#include <iostream>

#include <Kokkos_Core.hpp>

namespace Test {

namespace {

// Exercises Kokkos::TeamPolicy for the given execution space and schedule
// type. The struct is itself the functor launched by its static test drivers;
// the tag types declared inside it select which operator() overload runs.
template <class ExecSpace, class ScheduleType>
struct TestTeamPolicy {
  using team_member =
      typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
  using view_type = Kokkos::View<int **, ExecSpace>;

  // Per-thread flags, indexed as (team_rank, league_rank).
  view_type m_flags;

  // Allocates m_flags (contents left uninitialized) with extents
  // (team_size_max of this functor for a parallel_reduce, league_size).
  TestTeamPolicy(const size_t league_size)
      : m_flags(Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"),
  // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
                Kokkos::TeamPolicy<ScheduleType, ExecSpace>(
                    1, std::is_same<ExecSpace,
                                    Kokkos::Experimental::OpenMPTarget>::value
                           ? 32
                           : 1)
                    .team_size_max(*this, Kokkos::ParallelReduceTag()),
#else
                Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 1).team_size_max(
                    *this, Kokkos::ParallelReduceTag()),
#endif
                league_size) {
  }

  // Work tag selecting the verification overload of operator().
  struct VerifyInitTag {};

  // Untagged parallel_for body: stores this thread's global id
  // (team_rank + team_size * league_rank) into its m_flags entry.
  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member &member) const {
    const int tid =
        member.team_rank() + member.team_size() * member.league_rank();

    m_flags(member.team_rank(), member.league_rank()) = tid;
    // Compile-time check that the member type reports the policy's
    // execution space.
    static_assert(
        (std::is_same<typename team_member::execution_space, ExecSpace>::value),
        "TeamMember::execution_space is not the same as "
        "TeamPolicy<>::execution_space");
  }

  // VerifyInitTag parallel_for body: recomputes the global id and prints a
  // diagnostic if the stored flag differs. A mismatch is only printed; no
  // error state is recorded here.
  KOKKOS_INLINE_FUNCTION
  void operator()(const VerifyInitTag &, const team_member &member) const {
    const int tid =
        member.team_rank() + member.team_size() * member.league_rank();

    if (tid != m_flags(member.team_rank(), member.league_rank())) {
      Kokkos::printf("TestTeamPolicy member(%d,%d) error %d != %d\n",
                     member.league_rank(), member.team_rank(), tid,
                     m_flags(member.team_rank(), member.league_rank()));
    }
  }

  // Included for test_small_league_size.
  // Leaves m_flags default-constructed.
  TestTeamPolicy() : m_flags() {}

  // Included for test_small_league_size.
  // Work tag selecting the empty operator() below.
  struct NoOpTag {};

  // NoOpTag body: intentionally does nothing.
  KOKKOS_INLINE_FUNCTION
  void operator()(const NoOpTag &, const team_member & /*member*/) const {}

  // Launches the no-op kernel on a league of ns / bs = 2 teams with an
  // automatically chosen team size, requesting 960 bytes of per-team and
  // 0 bytes of per-thread scratch at level 0.
  static void test_small_league_size() {
    int bs = 8;   // batch size (number of elements per batch)
    int ns = 16;  // total number of "problems" to process

    // Calculate total scratch memory space size.
    const int level     = 0;
    int mem_size        = 960;
    const int num_teams = ns / bs;
    Kokkos::TeamPolicy<ExecSpace, NoOpTag> policy(num_teams, Kokkos::AUTO());

    Kokkos::parallel_for(
        policy.set_scratch_size(level, Kokkos::PerTeam(mem_size),
                                Kokkos::PerThread(0)),
        TestTeamPolicy());
  }

  // Constructs policies with each combination of explicit and Kokkos::AUTO()
  // team size / vector length. The policies are only constructed, never
  // launched; the (void) casts silence unused-variable warnings.
  static void test_constructors() {
    constexpr const int smallest_work = 1;
    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
    Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
        smallest_work,
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
            ? 32
            : smallest_work,
        smallest_work);
#else
    Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
        smallest_work, smallest_work, smallest_work);
#endif
    (void)none_auto;
    Kokkos::TeamPolicy<ExecSpace, NoOpTag> both_auto(
        smallest_work, Kokkos::AUTO(), Kokkos::AUTO());
    (void)both_auto;
    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
    Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
        smallest_work,
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
            ? 32
            : smallest_work,
        Kokkos::AUTO());
#else
    Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
        smallest_work, smallest_work, Kokkos::AUTO());
#endif
    (void)auto_vector;
    Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_team(
        smallest_work, Kokkos::AUTO(), smallest_work);
    (void)auto_team;
  }

  // Runs the untagged fill kernel followed by the VerifyInitTag check kernel,
  // each with the team_size_max reported for its own policy type, and then
  // runs test_small_league_size() and test_constructors().
  static void test_for(const size_t league_size) {
    {
      TestTeamPolicy functor(league_size);
      using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
      using policy_type_init =
          Kokkos::TeamPolicy<ScheduleType, ExecSpace, VerifyInitTag>;

      // FIXME_OPENMPTARGET temporary restriction for team size to be at least
      // 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
      const int team_size =
          policy_type(
              league_size,
              std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
                  ? 32
                  : 1)
              .team_size_max(functor, Kokkos::ParallelForTag());
      const int team_size_init =
          policy_type_init(
              league_size,
              std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
                  ? 32
                  : 1)
              .team_size_max(functor, Kokkos::ParallelForTag());
#else
      const int team_size =
          policy_type(league_size, 1)
              .team_size_max(functor, Kokkos::ParallelForTag());
      const int team_size_init =
          policy_type_init(league_size, 1)
              .team_size_max(functor, Kokkos::ParallelForTag());
#endif

      Kokkos::parallel_for(policy_type(league_size, team_size), functor);
      Kokkos::parallel_for(policy_type_init(league_size, team_size_init),
                           functor);
    }

    test_small_league_size();
    test_constructors();
  }

  // Work tag selecting the "1 + global id" reduction overload.
  struct ReduceTag {};

  using value_type = int64_t;

  // Untagged parallel_reduce body: contributes this thread's global id.
  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member &member, value_type &update) const {
    update += member.team_rank() + member.team_size() * member.league_rank();
  }

  // ReduceTag parallel_reduce body: contributes 1 + this thread's global id.
  KOKKOS_INLINE_FUNCTION
  void operator()(const ReduceTag &, const team_member &member,
                  value_type &update) const {
    update +=
        1 + member.team_rank() + member.team_size() * member.league_rank();
  }

  // Runs both reductions over N = team_size * league_size threads and checks
  // the results against the closed forms N*(N-1)/2 (sum of 0..N-1) and
  // N*(N+1)/2 (sum of 1..N).
  static void test_reduce(const size_t league_size) {
    TestTeamPolicy functor(league_size);

    using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
    using policy_type_reduce =
        Kokkos::TeamPolicy<ScheduleType, ExecSpace, ReduceTag>;

    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
    const int team_size =
        policy_type_reduce(
            league_size,
            std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
                ? 32
                : 1)
            .team_size_max(functor, Kokkos::ParallelReduceTag());
#else
    const int team_size =
        policy_type_reduce(league_size, 1)
            .team_size_max(functor, Kokkos::ParallelReduceTag());
#endif

    const int64_t N = team_size * league_size;

    int64_t total = 0;

    Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
                            total);
    ASSERT_EQ(size_t((N - 1) * (N)) / 2, size_t(total));

    // The same result variable is reused for the tagged reduction.
    Kokkos::parallel_reduce(policy_type_reduce(league_size, team_size), functor,
                            total);
    ASSERT_EQ((size_t(N) * size_t(N + 1)) / 2, size_t(total));
  }
};

}  // namespace

}  // namespace Test

/*--------------------------------------------------------------------------*/

namespace Test {

// Reduction functor for a team policy. The nwork work items are divided into
// contiguous chunks, one per (league_rank, team_rank) pair, and every item
// contributes to three running sums held in value_type.
template <typename ScalarType, class DeviceType, class ScheduleType>
class ReduceTeamFunctor {
 public:
  using execution_space = DeviceType;
  using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
  using size_type       = typename execution_space::size_type;

  // value[0] counts processed items, value[1] accumulates (index + 1) and
  // value[2] accumulates (nwork - index).
  struct value_type {
    ScalarType value[3];
  };

  // Total number of work items.
  const size_type nwork;

  KOKKOS_INLINE_FUNCTION
  ReduceTeamFunctor(const size_type &arg_nwork) : nwork(arg_nwork) {}

  KOKKOS_INLINE_FUNCTION
  ReduceTeamFunctor(const ReduceTeamFunctor &rhs) : nwork(rhs.nwork) {}

  // Resets all three sums to zero.
  KOKKOS_INLINE_FUNCTION
  void init(value_type &dst) const {
    for (int k = 0; k < 3; ++k) {
      dst.value[k] = 0;
    }
  }

  // Adds the three sums of src into dst, component by component.
  KOKKOS_INLINE_FUNCTION
  void join(value_type &dst, const value_type &src) const {
    for (int k = 0; k < 3; ++k) {
      dst.value[k] += src.value[k];
    }
  }

  // Processes the chunk of work items owned by the calling thread.
  KOKKOS_INLINE_FUNCTION
  void operator()(const typename policy_type::member_type ind,
                  value_type &dst) const {
    // Flattened position of this thread within the whole league, and the
    // total number of threads taking part.
    const int flat_rank  = ind.team_rank() + ind.team_size() * ind.league_rank();
    const int flat_count = ind.team_size() * ind.league_size();

    // Items per thread, rounded up so that every item is covered.
    const int per_thread = (nwork + flat_count - 1) / flat_count;

    const size_type first = per_thread * flat_rank;

    // Clamp the end of the chunk to the number of work items.
    size_type last = nwork;
    if (first + per_thread < nwork) {
      last = first + per_thread;
    }

    for (size_type item = first; item < last; ++item) {
      dst.value[0] += 1;
      dst.value[1] += item + 1;
      dst.value[2] += nwork - item;
    }
  }
};

}  // namespace Test

namespace {

template <typename ScalarType, class DeviceType, class ScheduleType>
class TestReduceTeam {
 public:
  using execution_space = DeviceType;
  using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
  using size_type       = typename execution_space::size_type;

  TestReduceTeam(const size_type &nwork) { run_test(nwork); }

  void run_test(const size_type &nwork) {
    using functor_type =
        Test::ReduceTeamFunctor<ScalarType, execution_space, ScheduleType>;
    using value_type = typename functor_type::value_type;
    using result_type =
        Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;

    enum { Count = 3 };
    enum { Repeat = 100 };

    value_type result[Repeat];

    const uint64_t nw   = nwork;
    const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);

    policy_type team_exec(nw, 1);

    const unsigned team_size = team_exec.team_size_recommended(
        functor_type(nwork), Kokkos::ParallelReduceTag());
    const unsigned league_size = (nwork + team_size - 1) / team_size;

    team_exec = policy_type(league_size, team_size);

    for (unsigned i = 0; i < Repeat; ++i) {
      result_type tmp(&result[i]);
      Kokkos::parallel_reduce(team_exec, functor_type(nwork), tmp);
    }

    execution_space().fence();

    for (unsigned i = 0; i < Repeat; ++i) {
      for (unsigned j = 0; j < Count; ++j) {
        const uint64_t correct = 0 == j % 3 ? nw : nsum;
        ASSERT_EQ((ScalarType)correct, result[i].value[j]);
      }
    }
  }
};

}  // namespace

/*--------------------------------------------------------------------------*/

namespace Test {

// Functor that exercises team-level collectives: a team_reduce with
// Kokkos::Max followed by three team_scan calls. Mismatches in the scan
// results are reported through the reduction value `error`.
template <class DeviceType, class ScheduleType>
class ScanTeamFunctor {
 public:
  using execution_space = DeviceType;
  using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
  using value_type      = int64_t;

  // Passed by pointer to the last team_scan call in operator().
  Kokkos::View<value_type, execution_space> accum;
  // Written by thread (0, 0) with T * (T + 1) / 2, where T is the total
  // thread count of the launch.
  Kokkos::View<value_type, execution_space> total;

  ScanTeamFunctor() : accum("accum"), total("total") {}

  KOKKOS_INLINE_FUNCTION
  void init(value_type &error) const { error = 0; }

  // Any non-zero input turns the joined value into 1.
  KOKKOS_INLINE_FUNCTION
  void join(value_type &error, value_type const &input) const {
    if (input != 0) {
      error = 1;
    }
  }

  // Max-style join: keeps the larger of the two values in dst.
  struct JoinMax {
    using value_type = int64_t;

    KOKKOS_INLINE_FUNCTION
    void join(value_type &dst, value_type const &input) const {
      if (input > dst) {
        dst = input;
      }
    }
  };

  KOKKOS_INLINE_FUNCTION
  void operator()(const typename policy_type::member_type ind,
                  value_type &error) const {
    const bool is_first_thread =
        (ind.league_rank() == 0) && (ind.team_rank() == 0);
    if (is_first_thread) {
      const int64_t thread_count = ind.league_size() * ind.team_size();
      total() = (thread_count * (thread_count + 1)) / 2;
    }

    // Team max: the highest-ranked thread contributes
    // league_rank + team_size - 1.
    int64_t team_max =
        static_cast<int64_t>(ind.league_rank() + ind.team_rank());
    ind.team_reduce(Kokkos::Max<int64_t>(team_max));

    const auto expected_max = ind.league_rank() + (ind.team_size() - 1);
    if (team_max != expected_max) {
      // Only printed; `error` is not modified for a max mismatch.
      Kokkos::printf(
          "ScanTeamFunctor[%i.%i of %i.%i] reduce_max_answer(%li) != "
          "reduce_max(%li)\n",
          static_cast<int>(ind.league_rank()),
          static_cast<int>(ind.team_rank()),
          static_cast<int>(ind.league_size()),
          static_cast<int>(ind.team_size()), static_cast<long>(expected_max),
          static_cast<long>(team_max));
    }

    // Scan: each thread contributes league_rank + team_rank + 2; the value
    // expected back is the sum of the contributions of all lower team ranks.
    const int64_t expected_scan =
        (ind.league_rank() + 1) * ind.team_rank() +
        (ind.team_rank() * (ind.team_rank() + 1)) / 2;

    const auto contribution = ind.league_rank() + 1 + ind.team_rank() + 1;

    // The scan is performed twice and both results must match.
    const int64_t first_scan  = ind.team_scan(contribution);
    const int64_t second_scan = ind.team_scan(contribution);

    const bool scans_match =
        (expected_scan == first_scan) && (expected_scan == second_scan);
    if (!scans_match) {
      Kokkos::printf(
          "ScanTeamFunctor[%i.%i of %i.%i] answer(%li) != scan_first(%li) or "
          "scan_second(%li)\n",
          static_cast<int>(ind.league_rank()),
          static_cast<int>(ind.team_rank()),
          static_cast<int>(ind.league_size()),
          static_cast<int>(ind.team_size()), static_cast<long>(expected_scan),
          static_cast<long>(first_scan), static_cast<long>(second_scan));

      error = 1;
    }

    // Final scan uses the overload that also takes a pointer to accum.
    const int64_t global_rank =
        ind.team_rank() + ind.team_size() * ind.league_rank();
    ind.team_scan(1 + global_rank, accum.data());
  }
};

template <class DeviceType, class ScheduleType>
class TestScanTeam {
 public:
  using execution_space = DeviceType;
  using value_type      = int64_t;
  using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
  using functor_type    = Test::ScanTeamFunctor<DeviceType, ScheduleType>;

  TestScanTeam(const size_t nteam) { run_test(nteam); }

  void run_test(const size_t nteam) {
    using result_type =
        Kokkos::View<int64_t, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;

    const unsigned REPEAT = 100000;
    unsigned Repeat;

    if (nteam == 0) {
      Repeat = 1;
    } else {
      Repeat = (REPEAT + nteam - 1) / nteam;  // Error here.
    }

    functor_type functor;

    policy_type team_exec(nteam, 1);
    const auto team_size =
        team_exec.team_size_max(functor, Kokkos::ParallelReduceTag());
    team_exec = policy_type(nteam, team_size);

    for (unsigned i = 0; i < Repeat; ++i) {
      int64_t accum = 0;
      int64_t total = 0;
      int64_t error = 0;
      Kokkos::deep_copy(functor.accum, total);

      Kokkos::parallel_reduce(team_exec, functor, result_type(&error));
      DeviceType().fence();

      Kokkos::deep_copy(accum, functor.accum);
      Kokkos::deep_copy(total, functor.total);

      ASSERT_EQ(error, 0);
      ASSERT_EQ(total, accum);
    }

    execution_space().fence();
  }
};

}  // namespace Test

/*--------------------------------------------------------------------------*/

namespace Test {

// Reduction functor that carves two int arrays of SHARED_COUNT entries out of
// the team's shared memory, fills them cooperatively, and has the last thread
// of each team verify the contents. `update` counts detected errors.
template <class ExecSpace, class ScheduleType>
struct SharedTeamFunctor {
  using execution_space = ExecSpace;
  using value_type      = int;
  using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;

  enum { SHARED_COUNT = 1000 };

  using shmem_space = typename ExecSpace::scratch_memory_space;

  // TBD: MemoryUnmanaged should be the default for shared memory space.
  using shared_int_array_type =
      Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>;

  // Shared memory required per team: room for both arrays, regardless of the
  // team size.
  inline unsigned team_shmem_size(int /*team_size*/) const {
    return 2 * shared_int_array_type::shmem_size(SHARED_COUNT);
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const typename policy_type::member_type &ind,
                  value_type &update) const {
    const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT);
    const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT);

    // A null data pointer only counts as a failure when a non-empty array
    // was requested.
    const bool alloc_failed =
        (SHARED_COUNT > 0) &&
        (shared_A.data() == nullptr || shared_B.data() == nullptr);

    if (alloc_failed) {
      Kokkos::printf(
          "member( %i/%i , %i/%i ) Failed to allocate shared memory of size "
          "%lu\n",
          static_cast<int>(ind.league_rank()),
          static_cast<int>(ind.league_size()),
          static_cast<int>(ind.team_rank()), static_cast<int>(ind.team_size()),
          static_cast<unsigned long>(SHARED_COUNT));

      ++update;  // Failure to allocate is an error.
      return;
    }

    const auto league_rank = ind.league_rank();
    const auto team_size   = ind.team_size();

    // Each thread fills the entries i = team_rank, team_rank + team_size, ...
    for (int i = ind.team_rank(); i < SHARED_COUNT; i += team_size) {
      shared_A[i] = i + league_rank;
      shared_B[i] = 2 * i + league_rank;
    }

    ind.team_barrier();

    // Only the last thread of the team checks the arrays.
    if (ind.team_rank() + 1 != team_size) {
      return;
    }

    for (int i = 0; i < SHARED_COUNT; ++i) {
      if (shared_A[i] != i + league_rank) {
        ++update;
      }

      if (shared_B[i] != 2 * i + league_rank) {
        ++update;
      }
    }
  }
};

}  // namespace Test

namespace {

template <class ExecSpace, class ScheduleType>
struct TestSharedTeam {
  TestSharedTeam() { run(); }

  void run() {
    using Functor = Test::SharedTeamFunctor<ExecSpace, ScheduleType>;
    using result_type =
        Kokkos::View<typename Functor::value_type, Kokkos::HostSpace,
                     Kokkos::MemoryUnmanaged>;

#ifdef KOKKOS_ENABLE_OPENMPTARGET
    const size_t team_size =
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
            ? Kokkos::TeamPolicy<ScheduleType, ExecSpace>(64, 32).team_size_max(
                  Functor(), Kokkos::ParallelReduceTag())
            : Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1)
                  .team_size_max(Functor(), Kokkos::ParallelReduceTag());

    Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
            ? 32 / team_size
            : 8192 / team_size,
        team_size);
#else
    const size_t team_size =
        Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1).team_size_max(
            Functor(), Kokkos::ParallelReduceTag());

    Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size,
                                                          team_size);
#endif

    typename Functor::value_type error_count = 0;

    Kokkos::parallel_reduce(team_exec, Functor(), result_type(&error_count));
    Kokkos::fence();

    ASSERT_EQ(error_count, 0);
  }
};

}  // namespace

namespace Test {

#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
// Lambda-based counterpart of the shared-memory team test: the kernel body is
// written inline as a KOKKOS_LAMBDA instead of using SharedTeamFunctor's
// operator(). Functor is referenced here only for its value_type.
template <class MemorySpace, class ExecSpace, class ScheduleType>
struct TestLambdaSharedTeam {
  // Constructing the object runs the test.
  TestLambdaSharedTeam() { run(); }

  void run() {
    using Functor     = Test::SharedTeamFunctor<ExecSpace, ScheduleType>;
    using result_type = Kokkos::View<typename Functor::value_type, MemorySpace,
                                     Kokkos::MemoryUnmanaged>;

    using shmem_space = typename ExecSpace::scratch_memory_space;

    // TBD: MemoryUnmanaged should be the default for shared memory space.
    using shared_int_array_type =
        Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>;

    const int SHARED_COUNT = 1000;
    // Team size: 1 by default, 32 for OpenMPTarget, 128 for Cuda.
#ifdef KOKKOS_ENABLE_OPENMPTARGET
    int team_size =
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
                                                                           : 1;
#else
    int team_size = 1;
#endif

#ifdef KOKKOS_ENABLE_CUDA
    if (std::is_same<ExecSpace, Kokkos::Cuda>::value) team_size = 128;
#endif

    // League size is chosen so that league_size * team_size is about 8192.
    Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size,
                                                          team_size);

    // A lambda cannot provide team_shmem_size(), so the room for the two
    // SHARED_COUNT-element arrays is requested explicitly as per-team
    // scratch (first argument 0).
    int scratch_size = shared_int_array_type::shmem_size(SHARED_COUNT) * 2;
    team_exec = team_exec.set_scratch_size(0, Kokkos::PerTeam(scratch_size));

    typename Functor::value_type error_count = 0;

    Kokkos::parallel_reduce(
        team_exec,
        KOKKOS_LAMBDA(
            const typename Kokkos::TeamPolicy<ScheduleType,
                                              ExecSpace>::member_type &ind,
            int &update) {
          const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT);
          const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT);

          // A null data pointer for a non-empty array is an allocation
          // failure.
          if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
              (shared_B.data() == nullptr && SHARED_COUNT > 0)) {
            Kokkos::printf("Failed to allocate shared memory of size %lu\n",
                           static_cast<unsigned long>(SHARED_COUNT));

            ++update;  // Failure to allocate is an error.
          } else {
            // Each thread fills entries team_rank, team_rank + team_size, ...
            for (int i = ind.team_rank(); i < SHARED_COUNT;
                 i += ind.team_size()) {
              shared_A[i] = i + ind.league_rank();
              shared_B[i] = 2 * i + ind.league_rank();
            }

            ind.team_barrier();

            // Only the last thread of the team verifies both arrays; every
            // mismatch adds one to the error count.
            if (ind.team_rank() + 1 == ind.team_size()) {
              for (int i = 0; i < SHARED_COUNT; ++i) {
                if (shared_A[i] != i + ind.league_rank()) {
                  ++update;
                }

                if (shared_B[i] != 2 * i + ind.league_rank()) {
                  ++update;
                }
              }
            }
          }
        },
        result_type(&error_count));

    Kokkos::fence();

    ASSERT_EQ(error_count, 0);
  }
};
#endif

}  // namespace Test

namespace Test {

// Reduction functor that checks level-1 team and thread scratch memory. Each
// thread records the addresses of its team-scratch and thread-scratch arrays
// so the team can verify that the team array is shared and that thread arrays
// are evenly spaced. `update` counts detected errors.
template <class ExecSpace, class ScheduleType>
struct ScratchTeamFunctor {
  using execution_space = ExecSpace;
  using value_type      = int;
  using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;

  enum { SHARED_TEAM_COUNT = 100 };
  enum { SHARED_THREAD_COUNT = 10 };

  using shmem_space = typename ExecSpace::scratch_memory_space;

  // TBD: MemoryUnmanaged should be the default for shared memory space.
  using shared_int_array_type =
      Kokkos::View<size_t *, shmem_space, Kokkos::MemoryUnmanaged>;

  KOKKOS_INLINE_FUNCTION
  void operator()(const typename policy_type::member_type &ind,
                  value_type &update) const {
    const auto team_size = ind.team_size();

    // Address table: entries [0, team_size) hold each thread's view of the
    // team array, entries [team_size, 2 * team_size) each thread's private
    // array.
    const shared_int_array_type scratch_ptr(ind.team_scratch(1),
                                            3 * team_size);
    const shared_int_array_type scratch_A(ind.team_scratch(1),
                                          SHARED_TEAM_COUNT);
    const shared_int_array_type scratch_B(ind.thread_scratch(1),
                                          SHARED_THREAD_COUNT);

    const bool table_missing = (scratch_ptr.data() == nullptr);
    const bool team_missing =
        (SHARED_TEAM_COUNT > 0) && (scratch_A.data() == nullptr);
    const bool thread_missing =
        (SHARED_THREAD_COUNT > 0) && (scratch_B.data() == nullptr);

    if (table_missing || team_missing || thread_missing) {
      Kokkos::printf("Failed to allocate shared memory of size %lu\n",
                     static_cast<unsigned long>(SHARED_TEAM_COUNT));

      ++update;  // Failure to allocate is an error.
      return;
    }

    // Fill the team array cooperatively and the thread array privately.
    Kokkos::parallel_for(
        Kokkos::TeamThreadRange(ind, 0, static_cast<int>(SHARED_TEAM_COUNT)),
        [&](const int &i) { scratch_A[i] = i + ind.league_rank(); });

    for (int i = 0; i < SHARED_THREAD_COUNT; i++) {
      scratch_B[i] = 10000 * ind.league_rank() + 100 * ind.team_rank() + i;
    }

    const auto my_rank = ind.team_rank();
    scratch_ptr[my_rank] = reinterpret_cast<size_t>(scratch_A.data());
    scratch_ptr[my_rank + team_size] =
        reinterpret_cast<size_t>(scratch_B.data());

    ind.team_barrier();

    // The team array must hold the values written above.
    for (int i = 0; i < SHARED_TEAM_COUNT; i++) {
      if (scratch_A[i] != size_t(i + ind.league_rank())) ++update;
    }

    // Every thread must have seen the same team-array address.
    for (int i = 0; i < team_size; i++) {
      if (scratch_ptr[0] != scratch_ptr[i]) ++update;
    }

    // Distance between the private arrays of thread 1 and thread 0; it must
    // leave room for a full thread array.
    const size_t thread_stride =
        scratch_ptr[1 + team_size] - scratch_ptr[0 + team_size];
    if (thread_stride < SHARED_THREAD_COUNT * sizeof(size_t)) {
      ++update;
    }

    // All consecutive private arrays must be separated by that same distance.
    for (int i = 1; i < team_size; i++) {
      const size_t gap =
          scratch_ptr[i + team_size] - scratch_ptr[i - 1 + team_size];
      if (gap != thread_stride) {
        ++update;
      }
    }
  }
};

}  // namespace Test

namespace {

template <class ExecSpace, class ScheduleType>
struct TestScratchTeam {
  TestScratchTeam() { run(); }

  void run() {
    using Functor = Test::ScratchTeamFunctor<ExecSpace, ScheduleType>;
    using result_type =
        Kokkos::View<typename Functor::value_type, Kokkos::HostSpace,
                     Kokkos::MemoryUnmanaged>;
    using p_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;

    typename Functor::value_type error_count = 0;

    int thread_scratch_size = Functor::shared_int_array_type::shmem_size(
        Functor::SHARED_THREAD_COUNT);

#ifdef KOKKOS_ENABLE_OPENMPTARGET
    p_type team_exec =
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
            ? p_type(64, 32).set_scratch_size(
                  1,
                  Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
                      Functor::SHARED_TEAM_COUNT)),
                  Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)))
            : p_type(8192, 1).set_scratch_size(
                  1,
                  Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
                      Functor::SHARED_TEAM_COUNT)),
                  Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
#else
    p_type team_exec = p_type(8192, 1).set_scratch_size(
        1,
        Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
            Functor::SHARED_TEAM_COUNT)),
        Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
#endif

    const size_t team_size =
        team_exec.team_size_max(Functor(), Kokkos::ParallelReduceTag());

    int team_scratch_size =
        Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) +
        Functor::shared_int_array_type::shmem_size(3 * team_size);

#ifdef KOKKOS_ENABLE_OPENMPTARGET
    team_exec =
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
            ? p_type(64 / team_size, team_size)
            : p_type(8192 / team_size, team_size);
#else
    team_exec          = p_type(8192 / team_size, team_size);
#endif

    Kokkos::parallel_reduce(
        team_exec.set_scratch_size(1, Kokkos::PerTeam(team_scratch_size),
                                   Kokkos::PerThread(thread_scratch_size)),
        Functor(), result_type(&error_count));
    Kokkos::fence();
    ASSERT_EQ(error_count, 0);

    Kokkos::parallel_reduce(
        team_exec.set_scratch_size(1, Kokkos::PerTeam(team_scratch_size),
                                   Kokkos::PerThread(thread_scratch_size)),
        Functor(), Kokkos::Sum<typename Functor::value_type>(error_count));
    Kokkos::fence();
    ASSERT_EQ(error_count, 0);
  }
};

}  // namespace

namespace Test {

// Kernel body shared by the multi-level scratch tests. It creates three team
// views and three thread views on each of scratch levels 0 and 1, fills them
// with values derived from the league rank, team rank and index, and returns
// the number of mismatches the calling thread finds when reading them back.
template <class ExecSpace>
KOKKOS_INLINE_FUNCTION int test_team_mulit_level_scratch_loop_body(
    const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
  using scratch_view =
      Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;

  // The views are created in this exact order, alternating between team and
  // thread scratch and between the two levels.
  scratch_view a_team1(team.team_scratch(0), 128);
  scratch_view a_thread1(team.thread_scratch(0), 16);
  scratch_view a_team2(team.team_scratch(0), 128);
  scratch_view a_thread2(team.thread_scratch(0), 16);

  scratch_view b_team1(team.team_scratch(1), 12800);
  scratch_view b_thread1(team.thread_scratch(1), 1600);
  scratch_view b_team2(team.team_scratch(1), 12800);
  scratch_view b_thread2(team.thread_scratch(1), 1600);

  scratch_view a_team3(team.team_scratch(0), 128);
  scratch_view a_thread3(team.thread_scratch(0), 16);
  scratch_view b_team3(team.team_scratch(1), 12800);
  scratch_view b_thread3(team.thread_scratch(1), 1600);

  // Offsets that make the stored values unique per league and per thread.
  const auto league_offset = team.league_rank() * 100000;
  const auto rank_offset   = 100000 * team.team_rank();

  // The explicit types for 0 and 128 are here to test TeamThreadRange accepting
  // different types for begin and end.
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(128)),
                       [&](const int &i) {
                         a_team1(i) = 1000000 + i + league_offset;
                         a_team2(i) = 2000000 + i + league_offset;
                         a_team3(i) = 3000000 + i + league_offset;
                       });
  team.team_barrier();

  Kokkos::parallel_for(
      Kokkos::ThreadVectorRange(team, int(0), unsigned(16)),
      [&](const int &i) {
        a_thread1(i) = 1000000 + rank_offset + 16 - i + league_offset;
        a_thread2(i) = 2000000 + rank_offset + 16 - i + league_offset;
        a_thread3(i) = 3000000 + rank_offset + 16 - i + league_offset;
      });

  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(12800)),
                       [&](const int &i) {
                         b_team1(i) = 1000000 + i + league_offset;
                         b_team2(i) = 2000000 + i + league_offset;
                         b_team3(i) = 3000000 + i + league_offset;
                       });
  team.team_barrier();

  Kokkos::parallel_for(
      Kokkos::ThreadVectorRange(team, 1600), [&](const int &i) {
        b_thread1(i) = 1000000 + rank_offset + 16 - i + league_offset;
        b_thread2(i) = 2000000 + rank_offset + 16 - i + league_offset;
        b_thread3(i) = 3000000 + rank_offset + 16 - i + league_offset;
      });

  team.team_barrier();

  // Read everything back with the same formulas and count the mismatches.
  int error = 0;
  Kokkos::parallel_for(
      Kokkos::TeamThreadRange(team, 0, 128), [&](const int &i) {
        if (a_team1(i) != 1000000 + i + league_offset) ++error;
        if (a_team2(i) != 2000000 + i + league_offset) ++error;
        if (a_team3(i) != 3000000 + i + league_offset) ++error;
      });
  team.team_barrier();

  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 16), [&](const int &i) {
    if (a_thread1(i) != 1000000 + rank_offset + 16 - i + league_offset) {
      ++error;
    }
    if (a_thread2(i) != 2000000 + rank_offset + 16 - i + league_offset) {
      ++error;
    }
    if (a_thread3(i) != 3000000 + rank_offset + 16 - i + league_offset) {
      ++error;
    }
  });

  Kokkos::parallel_for(
      Kokkos::TeamThreadRange(team, 0, 12800), [&](const int &i) {
        if (b_team1(i) != 1000000 + i + league_offset) ++error;
        if (b_team2(i) != 2000000 + i + league_offset) ++error;
        if (b_team3(i) != 3000000 + i + league_offset) ++error;
      });
  team.team_barrier();

  Kokkos::parallel_for(
      Kokkos::ThreadVectorRange(team, 1600), [&](const int &i) {
        if (b_thread1(i) != 1000000 + rank_offset + 16 - i + league_offset) {
          ++error;
        }
        if (b_thread2(i) != 2000000 + rank_offset + 16 - i + league_offset) {
          ++error;
        }
        if (b_thread3(i) != 3000000 + rank_offset + 16 - i + league_offset) {
          ++error;
        }
      });

  return error;
}

// Dispatch tag: selects the operator() overload that the scratch-test functors
// in this file run through Kokkos::parallel_reduce.
struct TagReduce {};
// Dispatch tag: selects the operator() overload that the scratch-test functors
// in this file run through Kokkos::parallel_for.
struct TagFor {};

template <class ExecSpace, class ScheduleType>
struct ClassNoShmemSizeFunction {
  using member_type =
      typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;

  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors;

  KOKKOS_INLINE_FUNCTION
  void operator()(const TagFor &, const member_type &team) const {
    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
    errors() += error;
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const TagReduce &, const member_type &team,
                  int &error) const {
    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
  }

  void run() {
    Kokkos::View<int, ExecSpace> d_errors =
        Kokkos::View<int, ExecSpace>("Errors");
    errors = d_errors;

    const int per_team0 =
        3 *
        Kokkos::View<double *, ExecSpace,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
    const int per_thread0 =
        3 *
        Kokkos::View<double *, ExecSpace,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);

    const int per_team1 =
        3 * Kokkos::View<
                double *, ExecSpace,
                Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(12800);
    const int per_thread1 =
        3 *
        Kokkos::View<double *, ExecSpace,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(1600);

#ifdef KOKKOS_ENABLE_SYCL
    int team_size = 4;
#else
    int team_size      = 8;
#endif
    int const concurrency = ExecSpace().concurrency();
    if (team_size > concurrency) team_size = concurrency;
    {
      Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size,
                                                                 16);

      Kokkos::parallel_for(
          policy
              .set_scratch_size(0, Kokkos::PerTeam(per_team0),
                                Kokkos::PerThread(per_thread0))
              .set_scratch_size(1, Kokkos::PerTeam(per_team1),
                                Kokkos::PerThread(per_thread1)),
          *this);
      Kokkos::fence();

      typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
          Kokkos::create_mirror_view(d_errors);
      Kokkos::deep_copy(h_errors, d_errors);
      ASSERT_EQ(h_errors(), 0);
    }

    {
      int error = 0;
      Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy(
          10, team_size, 16);

      Kokkos::parallel_reduce(
          policy
              .set_scratch_size(0, Kokkos::PerTeam(per_team0),
                                Kokkos::PerThread(per_thread0))
              .set_scratch_size(1, Kokkos::PerTeam(per_team1),
                                Kokkos::PerThread(per_thread1)),
          *this, error);

      ASSERT_EQ(error, 0);
    }
  };
};

template <class ExecSpace, class ScheduleType>
struct ClassWithShmemSizeFunction {
  using member_type =
      typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;

  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors;

  KOKKOS_INLINE_FUNCTION
  void operator()(const TagFor &, const member_type &team) const {
    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
    errors() += error;
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const TagReduce &, const member_type &team,
                  int &error) const {
    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
  }

  void run() {
    Kokkos::View<int, ExecSpace> d_errors =
        Kokkos::View<int, ExecSpace>("Errors");
    errors = d_errors;

    const int per_team1 =
        3 * Kokkos::View<
                double *, ExecSpace,
                Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(12800);
    const int per_thread1 =
        3 *
        Kokkos::View<double *, ExecSpace,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(1600);

    int team_size = 8;

    int const concurrency = ExecSpace().concurrency();
    if (team_size > concurrency) team_size = concurrency;

    {
      Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size,
                                                                 16);

      Kokkos::parallel_for(
          policy.set_scratch_size(1, Kokkos::PerTeam(per_team1),
                                  Kokkos::PerThread(per_thread1)),
          *this);
      Kokkos::fence();

      typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
          Kokkos::create_mirror_view(d_errors);
      Kokkos::deep_copy(h_errors, d_errors);
      ASSERT_EQ(h_errors(), 0);
    }

    {
      int error = 0;
      Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy(
          10, team_size, 16);

      Kokkos::parallel_reduce(
          policy.set_scratch_size(1, Kokkos::PerTeam(per_team1),
                                  Kokkos::PerThread(per_thread1)),
          *this, error);

      ASSERT_EQ(error, 0);
    }
  };

  unsigned team_shmem_size(int team_size) const {
    const int per_team0 =
        3 *
        Kokkos::View<double *, ExecSpace,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
    const int per_thread0 =
        3 *
        Kokkos::View<double *, ExecSpace,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
    return per_team0 + team_size * per_thread0;
  }
};

// Lambda-based variant of the multi-level scratch test: runs the shared loop
// body through parallel_for and parallel_reduce with both scratch levels
// requested on the policy. Compiles to an empty function when
// KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA is not defined.
template <class ExecSpace, class ScheduleType>
void test_team_mulit_level_scratch_test_lambda() {
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
  using scratch_view_type =
      Kokkos::View<double *, ExecSpace,
                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
  using member_type      = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
  using host_errors_type = typename Kokkos::View<int, ExecSpace>::HostMirror;

  // `errors` is the Atomic-trait handle used inside the kernel; `d_errors` is
  // the plain view it is assigned from and that is copied back to the host.
  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors;
  Kokkos::View<int, ExecSpace> d_errors("Errors");
  errors = d_errors;

  // Three views' worth of scratch per team / per thread on each level.
  const int per_team0   = 3 * scratch_view_type::shmem_size(128);
  const int per_thread0 = 3 * scratch_view_type::shmem_size(16);
  const int per_team1   = 3 * scratch_view_type::shmem_size(12800);
  const int per_thread1 = 3 * scratch_view_type::shmem_size(1600);

#ifdef KOKKOS_ENABLE_SYCL
  const int preferred_team_size = 4;
#else
  const int preferred_team_size = 8;
#endif
  // Never ask for more threads per team than the space can run at once.
  const int concurrency = ExecSpace().concurrency();
  const int team_size =
      preferred_team_size > concurrency ? concurrency : preferred_team_size;

  Kokkos::TeamPolicy<ExecSpace, ScheduleType> policy(10, team_size, 16);

  // parallel_for variant: errors are collected in the device counter.
  Kokkos::parallel_for(
      policy
          .set_scratch_size(0, Kokkos::PerTeam(per_team0),
                            Kokkos::PerThread(per_thread0))
          .set_scratch_size(1, Kokkos::PerTeam(per_team1),
                            Kokkos::PerThread(per_thread1)),
      KOKKOS_LAMBDA(const member_type &team) {
        const int team_errors =
            test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
        errors() += team_errors;
      });
  Kokkos::fence();

  host_errors_type h_errors = Kokkos::create_mirror_view(errors);
  Kokkos::deep_copy(h_errors, d_errors);
  ASSERT_EQ(h_errors(), 0);

  // parallel_reduce variant: errors are returned through the reduction.
  int reduced_errors = 0;
  Kokkos::parallel_reduce(
      policy
          .set_scratch_size(0, Kokkos::PerTeam(per_team0),
                            Kokkos::PerThread(per_thread0))
          .set_scratch_size(1, Kokkos::PerTeam(per_team1),
                            Kokkos::PerThread(per_thread1)),
      KOKKOS_LAMBDA(const member_type &team, int &count) {
        count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
      },
      reduced_errors);
  ASSERT_EQ(reduced_errors, 0);
#endif
}

}  // namespace Test

namespace {

template <class ExecSpace, class ScheduleType>
struct TestMultiLevelScratchTeam {
  TestMultiLevelScratchTeam() { run(); }

  void run() {
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
    Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>();
#endif
    Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1;
    c1.run();

    Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2;
    c2.run();
  }
};

}  // namespace

namespace Test {

template <class ExecSpace>
struct TestShmemSize {
  TestShmemSize() { run(); }

  void run() {
    using view_type = Kokkos::View<int64_t ***, ExecSpace>;

    size_t d1 = 5;
    size_t d2 = 6;
    size_t d3 = 7;

    size_t size = view_type::shmem_size(d1, d2, d3);

    ASSERT_EQ(size, (d1 * d2 * d3 + 1) * sizeof(int64_t));

    test_layout_stride();
  }

  void test_layout_stride() {
    int rank       = 3;
    int order[3]   = {2, 0, 1};
    int extents[3] = {100, 10, 3};
    auto s1 =
        Kokkos::View<double ***, Kokkos::LayoutStride, ExecSpace>::shmem_size(
            Kokkos::LayoutStride::order_dimensions(rank, order, extents));
    auto s2 =
        Kokkos::View<double ***, Kokkos::LayoutRight, ExecSpace>::shmem_size(
            extents[0], extents[1], extents[2]);
    ASSERT_EQ(s1, s2);
  }
};

}  // namespace Test

/*--------------------------------------------------------------------------*/

namespace Test {

namespace {

// Primary template for the team_broadcast tests; declared only. The defaulted
// `Enabled` parameter exists so that partial specializations can be selected
// with std::enable_if_t (in this file, on sizeof(T)).
template <class ExecSpace, class ScheduleType, class T, class Enabled = void>
struct TestTeamBroadcast;

// team_broadcast test for value types with sizeof(T) == sizeof(char).
// Contributions are combined with bitwise OR (Kokkos::BOr) rather than a sum.
template <class ExecSpace, class ScheduleType, class T>
struct TestTeamBroadcast<ExecSpace, ScheduleType, T,
                         std::enable_if_t<(sizeof(T) == sizeof(char)), void>> {
  using team_member =
      typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
  using memory_space = typename ExecSpace::memory_space;
  using value_type   = T;

  // Added to each thread's initial value before the broadcast.
  const value_type offset;

  // The league size argument is accepted but unused.
  TestTeamBroadcast(const size_t /*league_size*/, const value_type os_)
      : offset(os_) {}

  // Tag selecting the overload that broadcasts through a function object.
  struct BroadcastTag {};

  // Plain broadcast: thread (league_rank % team_size) is the source. The
  // broadcast value is OR-reduced over the team and team rank 0 ORs the team
  // result into `update`.
  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member &teamMember, value_type &update) const {
    int lid = teamMember.league_rank();
    int tid = teamMember.team_rank();
    int ts  = teamMember.team_size();

    value_type parUpdate = 0;
    value_type value     = (value_type)(tid % 0xFF) + offset;

    // broadcast value to team from source thread
    teamMember.team_broadcast(value, lid % ts);

    Kokkos::parallel_reduce(
        Kokkos::TeamThreadRange(teamMember, ts),
        [&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; },
        Kokkos::BOr<value_type, memory_space>(parUpdate));

    if (teamMember.team_rank() == 0) update |= parUpdate;
  }

  // Broadcast with a function object that subtracts `offset` again; the host
  // side therefore expects results without the offset for this overload.
  KOKKOS_INLINE_FUNCTION
  void operator()(const BroadcastTag &, const team_member &teamMember,
                  value_type &update) const {
    int lid = teamMember.league_rank();
    int tid = teamMember.team_rank();
    int ts  = teamMember.team_size();

    value_type parUpdate = 0;
    value_type value     = (value_type)(tid % 0xFF) + offset;

    teamMember.team_broadcast([&](value_type &var) { var -= offset; }, value,
                              lid % ts);

    Kokkos::parallel_reduce(
        Kokkos::TeamThreadRange(teamMember, ts),
        [&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; },
        Kokkos::BOr<value_type, memory_space>(parUpdate));

    if (teamMember.team_rank() == 0) update |= parUpdate;
  }

  // Runs both overloads with the maximum team size reported for the functor
  // and compares each BOr reduction against a host-side recomputation.
  static void test_teambroadcast(const size_t league_size,
                                 const value_type off) {
    TestTeamBroadcast functor(league_size, off);

    using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
    using policy_type_f =
        Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;

    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
    int fake_team_size =
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
                                                                           : 1;
#else
    int fake_team_size = 1;
#endif
    // The temporary policy is only used to query the maximum team size.
    const int team_size =
        policy_type_f(league_size, fake_team_size)
            .team_size_max(
                functor,
                Kokkos::
                    ParallelReduceTag());  // printf("team_size=%d\n",team_size);

    // team_broadcast with value
    value_type total = 0;

    Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
                            Kokkos::BOr<value_type, Kokkos::HostSpace>(total));

    // League i broadcasts from thread i % team_size, whose value is
    // (i % team_size % 0xFF) + off.
    value_type expected_result = 0;
    for (unsigned int i = 0; i < league_size; i++) {
      value_type val = (value_type((i % team_size % 0xFF)) + off);
      expected_result |= val;
    }
    ASSERT_EQ(expected_result, total);
    // printf("team_broadcast with value --"
    //"expected_result=%x,"
    //"total=%x\n",expected_result, total);

    // team_broadcast with function object
    total = 0;

    Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
                            Kokkos::BOr<value_type, Kokkos::HostSpace>(total));

    // Same as above but without `off`, which the function object subtracts.
    expected_result = 0;
    for (unsigned int i = 0; i < league_size; i++) {
      value_type val = ((value_type)((i % team_size % 0xFF)));
      expected_result |= val;
    }
    ASSERT_EQ(expected_result, total);
    // printf("team_broadcast with function object --"
    // "expected_result=%x,"
    // "total=%x\n",expected_result, total);
  }
};

// team_broadcast test for value types with sizeof(T) > sizeof(char).
// Contributions are combined with a sum reduction.
template <class ExecSpace, class ScheduleType, class T>
struct TestTeamBroadcast<ExecSpace, ScheduleType, T,
                         std::enable_if_t<(sizeof(T) > sizeof(char)), void>> {
  using team_member =
      typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
  using value_type = T;

  // Added to each thread's initial value before the broadcast.
  const value_type offset;

  // The league size argument is accepted but unused.
  TestTeamBroadcast(const size_t /*league_size*/, const value_type os_)
      : offset(os_) {}

  // Tag selecting the overload that broadcasts through a function object.
  struct BroadcastTag {};

  // Plain broadcast of both a value and a flag from thread
  // (league_rank % team_size). The value is summed team_size times over the
  // team and team rank 0 adds the team result to `update` when the broadcast
  // flag is set.
  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member &teamMember, value_type &update) const {
    int lid = teamMember.league_rank();
    int tid = teamMember.team_rank();
    int ts  = teamMember.team_size();

    value_type parUpdate = 0;
    value_type value     = (value_type)(tid * 3) + offset;

    // setValue is used to determine if the update should be
    // performed at the bottom.  The thread id must match the
    // thread id used to broadcast the value.  It is the
    // thread id that matches the league rank mod team size
    // this way each league rank will use a different thread id
    // which is likely not 0
    bool setValue = ((lid % ts) == tid);

    // broadcast boolean and value to team from source thread
    teamMember.team_broadcast(value, lid % ts);
    teamMember.team_broadcast(setValue, lid % ts);

    Kokkos::parallel_reduce(
        Kokkos::TeamThreadRange(teamMember, ts),
        [&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; },
        parUpdate);

    if (teamMember.team_rank() == 0 && setValue) update += parUpdate;
  }

  // Broadcast with function objects: the value functor doubles the value and
  // the flag functor negates the flag, so the host side expects twice the
  // plain-broadcast result for this overload.
  KOKKOS_INLINE_FUNCTION
  void operator()(const BroadcastTag &, const team_member &teamMember,
                  value_type &update) const {
    int lid = teamMember.league_rank();
    int tid = teamMember.team_rank();
    int ts  = teamMember.team_size();

    value_type parUpdate = 0;
    value_type value     = (value_type)(tid * 3) + offset;

    // setValue is used to determine if the update should be
    // performed at the bottom.  The thread id must match the
    // thread id used to broadcast the value.  It is the
    // thread id that matches the league rank mod team size
    // this way each league rank will use a different thread id
    // which is likely not 0. Note the logic is switched from
    // above because the functor switches it back.
    bool setValue = ((lid % ts) != tid);

    teamMember.team_broadcast([&](value_type &var) { var += var; }, value,
                              lid % ts);
    teamMember.team_broadcast([&](bool &bVar) { bVar = !bVar; }, setValue,
                              lid % ts);

    Kokkos::parallel_reduce(
        Kokkos::TeamThreadRange(teamMember, ts),
        [&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; },
        parUpdate);

    if (teamMember.team_rank() == 0 && setValue) update += parUpdate;
  }

  // Comparison for non-integral scalars: double and float are compared with a
  // tolerance of epsilon_factor * |A| * machine epsilon of ScalarType; any
  // other non-integral type is compared for equality.
  template <class ScalarType>
  static inline std::enable_if_t<!std::is_integral<ScalarType>::value, void>
  compare_test(ScalarType A, ScalarType B, double epsilon_factor) {
    if (std::is_same<ScalarType, double>::value ||
        std::is_same<ScalarType, float>::value) {
      ASSERT_NEAR((double)A, (double)B,
                  epsilon_factor * std::abs(A) *
                      std::numeric_limits<ScalarType>::epsilon());
    } else {
      ASSERT_EQ(A, B);
    }
  }

  // Comparison for integral scalars: exact equality, the factor is ignored.
  template <class ScalarType>
  static inline std::enable_if_t<std::is_integral<ScalarType>::value, void>
  compare_test(ScalarType A, ScalarType B, double) {
    ASSERT_EQ(A, B);
  }

  // Runs both overloads with the maximum team size reported for the functor
  // and compares each sum reduction against a host-side recomputation.
  static void test_teambroadcast(const size_t league_size,
                                 const value_type off) {
    TestTeamBroadcast functor(league_size, off);

    using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
    using policy_type_f =
        Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;

    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
    int fake_team_size =
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
                                                                           : 1;
#else
    int fake_team_size = 1;
#endif
    // The temporary policy is only used to query the maximum team size.
    const int team_size =
        policy_type_f(league_size, fake_team_size)
            .team_size_max(
                functor,
                Kokkos::
                    ParallelReduceTag());  // printf("team_size=%d\n",team_size);
    // team_broadcast with value
    value_type total = 0;

    Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
                            total);

    // League i broadcasts (i % team_size) * 3 + off, which the team-level
    // reduction adds team_size times.
    value_type expected_result = 0;
    for (unsigned int i = 0; i < league_size; i++) {
      value_type val =
          (value_type((i % team_size) * 3) + off) * value_type(team_size);
      expected_result += val;
    }
    // For comparison purposes treat the reduction as a random walk in the
    // least significant digit, which gives a typical walk distance
    // sqrt(league_size) Add 4x for larger sigma
    compare_test(expected_result, total, 4.0 * std::sqrt(league_size));

    // team_broadcast with function object
    total = 0;

    Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
                            total);

    // As above, with an extra factor of 2 because the function object doubles
    // the broadcast value.
    expected_result = 0;
    for (unsigned int i = 0; i < league_size; i++) {
      value_type val = ((value_type)((i % team_size) * 3) + off) *
                       (value_type)(2 * team_size);
      expected_result += val;
    }
    // For comparison purposes treat the reduction as a random walk in the
    // least significant digit, which gives a typical walk distance
    // sqrt(league_size) Add 4x for larger sigma
    compare_test(expected_result, total, 4.0 * std::sqrt(league_size));
  }
};

// Tests alignment of team scratch allocations, both through scratch views and
// through raw get_shmem()/get_shmem_aligned() requests. Constructing an
// instance runs all checks.
template <class ExecSpace>
struct TestScratchAlignment {
  // 24-byte element type (three doubles) used for the scratch view check.
  struct TestScalar {
    double x, y, z;
  };
  TestScratchAlignment() {
    test_view(true);
    test_view(false);
    test_minimal();
    test_raw();
  }
  using ScratchView =
      Kokkos::View<TestScalar *, typename ExecSpace::scratch_memory_space>;
  using ScratchViewInt =
      Kokkos::View<int *, typename ExecSpace::scratch_memory_space>;
  // Allocates an 11-element TestScalar scratch view, optionally preceded by a
  // one-element int scratch view, and aborts if the TestScalar view's data
  // pointer is not a multiple of sizeof(TestScalar).
  void test_view(bool allocate_small) {
    int shmem_size = ScratchView::shmem_size(11);
    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
    int team_size =
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
                                                                           : 1;
#else
    int team_size      = 1;
#endif
    // Reserve room for the small int view as well when it will be allocated.
    if (allocate_small) shmem_size += ScratchViewInt::shmem_size(1);
    Kokkos::parallel_for(
        Kokkos::TeamPolicy<ExecSpace>(1, team_size)
            .set_scratch_size(0, Kokkos::PerTeam(shmem_size)),
        KOKKOS_LAMBDA(
            const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
          // The temporary int view is created only to take scratch space ahead
          // of the view whose alignment is checked.
          if (allocate_small) ScratchViewInt(team.team_scratch(0), 1);
          ScratchView a(team.team_scratch(0), 11);
          if (ptrdiff_t(a.data()) % sizeof(TestScalar) != 0)
            Kokkos::abort("Error: invalid scratch view alignment\n");
        });
    Kokkos::fence();
  }

  // test really small size of scratch space, produced error before
  // (requests exactly sizeof(int) bytes and fails if get_shmem returns null)
  void test_minimal() {
    using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
    int team_size =
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
                                                                           : 1;
#else
    int team_size      = 1;
#endif
    Kokkos::TeamPolicy<ExecSpace> policy(1, team_size);
    size_t scratch_size = sizeof(int);
    // Set to 1 by the kernel when the allocation fails.
    Kokkos::View<int, ExecSpace> flag("Flag");

    Kokkos::parallel_for(
        policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
        KOKKOS_LAMBDA(const member_type &team) {
          int *scratch_ptr = (int *)team.team_shmem().get_shmem(scratch_size);
          if (scratch_ptr == nullptr) flag() = 1;
        });
    Kokkos::fence();
    int minimal_scratch_allocation_failed = 0;
    Kokkos::deep_copy(minimal_scratch_allocation_failed, flag);
    ASSERT_EQ(minimal_scratch_allocation_failed, 0);
  }

  // test alignment of successive allocations
  void test_raw() {
    using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
#ifdef KOKKOS_ENABLE_OPENMPTARGET
    int team_size =
        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
                                                                           : 1;
#else
    int team_size      = 1;
#endif
    Kokkos::TeamPolicy<ExecSpace> policy(1, team_size);
    // Set to 1 by the kernel when any of the checks below fails.
    Kokkos::View<int, ExecSpace> flag("Flag");

    Kokkos::parallel_for(
        policy.set_scratch_size(0, Kokkos::PerTeam(1024)),
        KOKKOS_LAMBDA(const member_type &team) {
          // first get some unaligned allocations, should give back
          // exactly the requested number of bytes
          auto scratch_ptr1 =
              reinterpret_cast<intptr_t>(team.team_shmem().get_shmem(24));
          auto scratch_ptr2 =
              reinterpret_cast<intptr_t>(team.team_shmem().get_shmem(32));
          auto scratch_ptr3 =
              reinterpret_cast<intptr_t>(team.team_shmem().get_shmem(12));

          if (((scratch_ptr2 - scratch_ptr1) != 24) ||
              ((scratch_ptr3 - scratch_ptr2) != 32))
            flag() = 1;

          // Now request aligned memory such that the allocation after
          // scratch_ptr2 would be unaligned if it doesn't pad correctly.
          // Depending on scratch_ptr3 being 4 or 8 byte aligned
          // we need to request a different amount of memory.
          if ((scratch_ptr3 + 12) % 8 == 4)
            scratch_ptr1 = reinterpret_cast<intptr_t>(
                team.team_shmem().get_shmem_aligned(24, 4));
          else {
            scratch_ptr1 = reinterpret_cast<intptr_t>(
                team.team_shmem().get_shmem_aligned(12, 4));
          }
          scratch_ptr2 = reinterpret_cast<intptr_t>(
              team.team_shmem().get_shmem_aligned(32, 8));
          scratch_ptr3 = reinterpret_cast<intptr_t>(
              team.team_shmem().get_shmem_aligned(8, 4));

          // The difference between scratch_ptr2 and scratch_ptr1 should be 4
          // bytes larger than what we requested in either case.
          if (((scratch_ptr2 - scratch_ptr1) != 28) &&
              ((scratch_ptr2 - scratch_ptr1) != 16))
            flag() = 1;
          // Check that there wasn't unnecessary padding happening. Since
          // scratch_ptr2 was allocated with a 32 byte request and scratch_ptr3
          // is then already aligned, its difference should match 32 bytes.
          if ((scratch_ptr3 - scratch_ptr2) != 32) flag() = 1;

          // check that the actual alignment of the pointers is as requested
          // cast to int here to avoid failure with icpx in mixed integer type
          // comparison
          if ((int(scratch_ptr1 % 4) != 0) || (int(scratch_ptr2 % 8) != 0) ||
              (int(scratch_ptr3 % 4) != 0))
            flag() = 1;
        });
    Kokkos::fence();
    int raw_get_shmem_alignment_failed = 0;
    Kokkos::deep_copy(raw_get_shmem_alignment_failed, flag);
    ASSERT_EQ(raw_get_shmem_alignment_failed, 0);
  }
};

}  // namespace

namespace {
// Launches a team kernel whose lambda receives the team member handle by
// value instead of by const reference. Constructing an instance runs the
// kernel; nothing is checked afterwards.
template <class ExecSpace>
struct TestTeamPolicyHandleByValue {
  using scalar     = double;
  using exec_space = ExecSpace;
  using mem_space  = typename ExecSpace::memory_space;

  TestTeamPolicyHandleByValue() { test(); }

  // Only active when lambda dispatch is enabled; otherwise a no-op.
  void test() {
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
    using view_type   = Kokkos::View<scalar **, mem_space>;
    using policy_type = Kokkos::TeamPolicy<exec_space>;
    using member_type = typename policy_type::member_type;

    const int M = 1;
    const int N = 1;

    view_type a("a", M, N);
    view_type b("b", M, N);
    Kokkos::deep_copy(a, 0.0);
    Kokkos::deep_copy(b, 1.0);

    // One team per row; each team adds its row of b onto the same row of a.
    Kokkos::parallel_for(
        "test_tphandle_by_value", policy_type(M, Kokkos::AUTO(), 1),
        KOKKOS_LAMBDA(const member_type team) {
          const int row = team.league_rank();
          Kokkos::parallel_for(
              Kokkos::TeamThreadRange(team, 0, N),
              [&](const int col) { a(row, col) += b(row, col); });
        });
#endif
  }
};

}  // namespace

namespace {
template <typename ExecutionSpace>
struct TestRepeatedTeamReduce {
  static constexpr int ncol = 1500;  // nothing special, just some work

  KOKKOS_FUNCTION void operator()(
      const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
      const {
    // non-divisible by power of two to make triggering problems easier
    constexpr int nlev = 129;
    constexpr auto pi  = Kokkos::numbers::pi;
    double b           = 0.;
    for (int ri = 0; ri < 10; ++ri) {
      // The contributions here must be sufficiently complex, simply adding ones
      // wasn't enough to trigger the bug.
      const auto g1 = [&](const int k, double &acc) {
        acc += Kokkos::cos(pi * double(k) / nlev);
      };
      const auto g2 = [&](const int k, double &acc) {
        acc += Kokkos::sin(pi * double(k) / nlev);
      };
      double a1, a2;
      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
      b += a1;
      b += a2;
    }
    const auto h = [&]() {
      const auto col = team.league_rank();
      v(col)         = b + col;
    };
    Kokkos::single(Kokkos::PerTeam(team), h);
  }

  KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
    if (v(i) != v(0) + i) {
      ++bad;
      Kokkos::printf("Failing at %d!\n", i);
    }
  }

  TestRepeatedTeamReduce() : v("v", ncol) { test(); }

  void test() {
    int team_size_recommended =
        Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended(
            *this, Kokkos::ParallelForTag());
    // Choose a non-recommened (non-power of two for GPUs) team size
    int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1;

    // The failure was non-deterministic so run the test a bunch of times
    for (int it = 0; it < 100; ++it) {
      Kokkos::parallel_for(
          Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this);

      int bad = 0;
      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol),
                              *this, bad);
      ASSERT_EQ(bad, 0) << " Failing in iteration " << it;
    }
  }

  Kokkos::View<double *, ExecutionSpace> v;
};

}  // namespace

}  // namespace Test

/*--------------------------------------------------------------------------*/
